This is the format we expect you to submit your homeworks in, allowing you to interlace code and written answers.
This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
plot(cars)
plot(cars)
plot(cars)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
Used on github and in Rmarkdown.
Plain text.
End a line with two spaces to start a new paragraph.
*italics* and _italics_
**bold** and __bold__
superscript^2^
~~strikethrough~~
[link](www.rstudio.com)
Plain text. End a line with two spaces to start a new paragraph.
italics and italics bold and bold superscript2 strikethrough cheatsheet 1 cheatsheet 2
# Header 1
## Header 2
### Header 3
#### Header 4
##### Header 5
###### Header 6
endash: --
emdash: ---
ellipsis: ...
inline equation: $A = \pi*r^{2}$
image: 
horizontal rule (or slide break):
***
endash: – emdash: — ellipsis: … inline equation: \(A = \pi*r^{2}\) image: horizontal rule (or slide break):
> block quote
* unordered list
* item 2
+ sub-item 1
+ sub-item 2
1. ordered list
2. item 2
+ sub-item 1
+ sub-item 2
Table Header | Second Header
------------- | -------------
Table Cell | Cell 2
Cell 3 | Cell 4
block quote
| Table Header | Second Header |
|---|---|
| Table Cell | Cell 2 |
| Cell 3 | Cell 4 |
# Imports
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(readxl)
# Choose the excel since a lot of the datasets need excel
url <- "https://www.eia.gov/totalenergy/data/browser/xls.php?tbl=T02.01&freq=m"
data_folder <- "data"
# Create a directory to store data
dir.create(data_folder, showWarnings = T)
## Warning in dir.create(data_folder, showWarnings = T): 'data' already exists
# Create the path I want to save it to
file <- file.path(data_folder, basename(url))
# Download the file
download.file(url, file)
df <- read_excel("data/xls.php?tbl=T02.01&freq=m", skip = 10)
# View the table
df
# Check the structure - a dataframe is a list of vectors
str(df)
## tibble [571 × 12] (S3: tbl_df/tbl/data.frame)
## $ Month : POSIXct[1:571], format: NA "1973-01-01" ...
## $ Primary Energy Consumed by the Residential Sector : chr [1:571] "(Trillion Btu)" "1339.27" "1174.899" "983.374" ...
## $ Total Energy Consumed by the Residential Sector : chr [1:571] "(Trillion Btu)" "1957.641" "1712.143" "1510.079" ...
## $ Primary Energy Consumed by the Commercial Sector : chr [1:571] "(Trillion Btu)" "657.977" "623.759" "492.136" ...
## $ Total Energy Consumed by the Commercial Sector : chr [1:571] "(Trillion Btu)" "1080.68" "1003.913" "886.42" ...
## $ Primary Energy Consumed by the Industrial Sector : chr [1:571] "(Trillion Btu)" "2027.033" "1841.299" "1926.411" ...
## $ Total Energy Consumed by the Industrial Sector : chr [1:571] "(Trillion Btu)" "2673.882" "2432.592" "2561.729" ...
## $ Primary Energy Consumed by the Transportation Sector: chr [1:571] "(Trillion Btu)" "1504.805" "1438.298" "1561.293" ...
## $ Total Energy Consumed by the Transportation Sector : chr [1:571] "(Trillion Btu)" "1507.977" "1441.064" "1564.146" ...
## $ Primary Energy Consumed by the Electric Power Sector: chr [1:571] "(Trillion Btu)" "1691.096" "1511.458" "1559.159" ...
## $ Energy Consumption Balancing Item : chr [1:571] "(Trillion Btu)" "3.693" "2.654" "-0.934" ...
## $ Primary Energy Consumption Total : chr [1:571] "(Trillion Btu)" "7223.873" "6592.366" "6521.439" ...
# Convert to data.table
# Remove the row with units
# Convert everything except dates to a numeric datatype
# Rename columns to make them easier to handle in data.table
Rename columns to make them easier to handle in data.table
# Get the current names
names(df)
## [1] "Month"
## [2] "Primary Energy Consumed by the Residential Sector"
## [3] "Total Energy Consumed by the Residential Sector"
## [4] "Primary Energy Consumed by the Commercial Sector"
## [5] "Total Energy Consumed by the Commercial Sector"
## [6] "Primary Energy Consumed by the Industrial Sector"
## [7] "Total Energy Consumed by the Industrial Sector"
## [8] "Primary Energy Consumed by the Transportation Sector"
## [9] "Total Energy Consumed by the Transportation Sector"
## [10] "Primary Energy Consumed by the Electric Power Sector"
## [11] "Energy Consumption Balancing Item"
## [12] "Primary Energy Consumption Total"
# Store corresponding names in a vector
columns <- c('month', 'res_prim','res_tot',
'com_prim','com_tot','ind_prim','ind_tot',
'trans_prim','trans_tot','power_prim',
'balancing','total')
names(columns) <- names(df)
# Set new names
setnames(df,columns)
# Convert to data.table
dt <- as.data.table(df)
# Remove the row with units dt[i,j,by]
dt <- dt[!1]
# What are the datatypes?
str(dt)
## Classes 'data.table' and 'data.frame': 570 obs. of 12 variables:
## $ month : POSIXct, format: "1973-01-01" "1973-02-01" ...
## $ res_prim : chr "1339.27" "1174.899" "983.374" "715.391" ...
## $ res_tot : chr "1957.641" "1712.143" "1510.079" "1183.421" ...
## $ com_prim : chr "657.977" "623.759" "492.136" "363.393" ...
## $ com_tot : chr "1080.68" "1003.913" "886.42" "736.851" ...
## $ ind_prim : chr "2027.033" "1841.299" "1926.411" "1914.017" ...
## $ ind_tot : chr "2673.882" "2432.592" "2561.729" "2540.02" ...
## $ trans_prim: chr "1504.805" "1438.298" "1561.293" "1480.403" ...
## $ trans_tot : chr "1507.977" "1441.064" "1564.146" "1483.064" ...
## $ power_prim: chr "1691.096" "1511.458" "1559.159" "1470.152" ...
## $ balancing : chr "3.693" "2.654" "-0.934" "-1.627" ...
## $ total : chr "7223.873" "6592.366" "6521.439" "5941.729" ...
## - attr(*, ".internal.selfref")=<externalptr>
# Get the column names we want to convert to numeric datatype
num_cols <- columns[2:length(columns)]
num_cols
## res_prim res_tot com_prim com_tot ind_prim ind_tot
## "res_prim" "res_tot" "com_prim" "com_tot" "ind_prim" "ind_tot"
## trans_prim trans_tot power_prim balancing total
## "trans_prim" "trans_tot" "power_prim" "balancing" "total"
# Convert to numeric datatype
# One-liner
dt[, (num_cols):= lapply(.SD, as.numeric), .SDcols = num_cols]
# Manual way
dt[, res_prim := as.numeric(res_prim)]
dt[, res_tot := as.numeric(res_tot )]
dt[, com_prim := as.numeric(com_prim)]
dt[, com_tot := as.numeric(com_tot )]
dt[, ind_prim := as.numeric(ind_prim)]
dt[, ind_tot := as.numeric(ind_tot )]
dt[, trans_prim := as.numeric(trans_prim)]
dt[, trans_tot := as.numeric(trans_tot )]
dt[, power_prim := as.numeric(power_prim)]
dt[, balancing := as.numeric(balancing)]
dt[, total := as.numeric(total)]
I want a graph for primary energy and total energy separately
# Manually filter the names I want
columns_prim <- c("month","res_prim","com_prim","ind_prim","trans_prim","power_prim")
# Regular Expressions way
cols <- names(dt)
# Search for the columns with month or prim
columns_prim <- grepl('month|prim',cols)
columns_prim
## [1] TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE
# Use the boolean vector to filter your column vector
columns_prim <- cols[columns_prim]
columns_prim
## [1] "month" "res_prim" "com_prim" "ind_prim" "trans_prim"
## [6] "power_prim"
# Same process, only I'm searching for columns with month or ENDING with tot
columns_tot <- grepl('month|tot$',cols)
columns_tot <- cols[columns_tot]
# make two separate dts
dt_prim <- dt[,..columns_prim]
dt_tot <- dt[,..columns_tot]
# Melt it
dt_prim
dt_prim <- melt(dt_prim,'month',columns_prim[2:length(columns_prim)])
dt_prim
dt_tot
dt_tot <- melt(dt_tot,'month',columns_tot[2:length(columns_tot)])
dt_tot
# Setup the chart
ggplot(dt_tot, aes(x = month, y= value, fill = variable)) +
geom_area(position = 'stack') + # Make a stacked area chart
ggtitle("Total Energy Consumption by Sector") + # Title it
ylab("Consumption (Trillion btu)") + # label axes
xlab("Date") + # label axes
theme(legend.text = element_text(size = 8)) +
guides(fill = guide_legend(ncol = 1))
## Plot two types of charts
# Groupby variable and year and sum over the months
# Create a variable year by running the function year over the variable I have called month
dt_prim[,year:=year(month)]
# Set the 'key' or 'index' of the dataframe as variable-year
setkey(dt_prim,variable,year)
dt_prim
# Now I can easily groupby the key and do operations on each group
yearly_primary <- dt_prim[,list(consumption_prim = sum(value)),key(dt_prim)]
yearly_primary
# Setup the chart
ggplot(yearly_primary, aes(x = year, y= consumption_prim, fill = variable)) +
geom_bar(position = 'stack', stat = 'identity') + # Make a stacked bar chart
ggtitle("Primary Energy Consumption by Sector") + # Title it
ylab("Consumption (Trillion btu)") + # label axes
xlab("Date") + # label axes
theme(legend.text = element_text(size = 8)) +
guides(fill = guide_legend(ncol = 1))
You need a unique key that matches both dataframes * Creating this key might take some work! * e.g state_company_id_plant_id by concatenating I’m going to stack the total energy data and join it to yearly_primary
dt_tot[,year := year(month)]
setkey(dt_tot,variable,year)
yearly_tot <- dt_tot[,list(consumption_tot = sum(value)),key(dt_prim)]
yearly_tot <- dcast(yearly_tot, year ~ variable)
## Using 'consumption_tot' as value column. Use 'value.var' to override
yearly_prim <- dcast(yearly_primary, year ~ variable)
## Using 'consumption_prim' as value column. Use 'value.var' to override
# Yearly consumption by merging the two
merge(yearly_tot,yearly_prim)
## Key institutional points about electricity markets
## Natural Monopolies